library(nycflights13)
library(tidyverse)
## Warning: package 'dplyr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(here)
## here() starts at /Users/mateosarabia/Documents/data sci/ds241_f23/Sarabia-Portfolio

#> ── Attaching core tidyverse packages ───────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.3     ✔ readr     2.1.4
#> ✔ forcats   1.0.0     ✔ stringr   1.5.0
#> ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
#> ✔ purrr     1.0.2
#> ── Conflicts ─────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (http://conflicted.r-lib.org/) to force all conflicts to become errors

# Print a preview of the flights table that ships with nycflights13.
nycflights13::flights

# Keep a working copy under a short name, then list its columns and types.
df1 <- flights
df1 |> glimpse()
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
# Task 1: September flights whose origin is Miami (MIA).
# NOTE(review): the origins visible in the glimpse of df1 are EWR, LGA and
# JFK, so this is presumably an empty result -- confirm that is intended.
df2 <- df1 |>
  filter(month == 9, origin == "MIA")

# Task 2: September flights whose destination is Miami (MIA).
df3 <- df1 |>
  filter(month == 9, dest == "MIA")

# Task 2a: same filter with the code "MAI".
# NOTE(review): presumably a deliberate misspelling of "MIA" to show what an
# unmatched code returns -- confirm.
df3a <- df1 |>
  filter(month == 9, dest == "MAI")

# Task 3: January flights whose destination is Miami (MIA).
df4 <- df1 |>
  filter(month == 1, dest == "MIA")

# Task 4: summer (June through August) flights to the Chicago airports
# ORD and MDW.
df5 <- df1 |>
  filter(month %in% 6:8, dest %in% c("ORD", "MDW"))
# Flight numbers repeat across days, so reduce the September Miami-bound
# flights (df3) to their distinct flight numbers.
numbers <- unique(df3[["flight"]])

# Print the smallest of those flight numbers.
min(numbers)
## [1] 83
# Build df6: every flight to Miami (any month) whose flight number equals the
# smallest flight number among September's Miami-bound flights (df3).
# The number is computed here instead of being typed in by hand, so df6 cannot
# drift out of sync with the data; with the current data it evaluates to 83.
smallest_flight <- min(df3$flight)

df6 <- df1 |>
  filter(dest == "MIA", flight == smallest_flight)


# Visualise the delays recorded for this flight number: each point is one
# flight in df6, departure delay on x against arrival delay on y.
df6 |>
  ggplot(aes(x = dep_delay, y = arr_delay)) +
  geom_point()

## Is there a correlation between departure delays for Miami as opposed to Newark?

## Which airport has the fewest departure delays?

# Tally Miami-bound flights by origin airport, largest count first.
df1 |>
  filter(dest == "MIA") |>
  count(origin, sort = TRUE)

## Is flight time affected by departure delay?

# df7: flights from LGA to MIA, with flt_delta holding the difference between
# arrival delay and departure delay for each flight.
df7 <- df1 |>
  filter(origin == "LGA", dest == "MIA") |>
  mutate(flt_delta = arr_delay - dep_delay)

# Scatter of flt_delta against departure delay; points are made nearly
# transparent so that dense regions stand out.
ggplot(df7, aes(x = dep_delay, y = flt_delta)) +
  geom_point(alpha = 0.1)
## Warning: Removed 79 rows containing missing values (`geom_point()`).

# Same scatter as before, plus a horizontal reference line at the mean of
# flt_delta (missing values are ignored when taking the mean).
ggplot(df7, aes(x = dep_delay, y = flt_delta)) +
  geom_point(alpha = 0.1) +
  geom_hline(aes(yintercept = mean(flt_delta, na.rm = TRUE)))
## Warning: Removed 79 rows containing missing values (`geom_point()`).

## is departure delay affected by time of year?

# Departure delay of each LGA-to-MIA flight plotted against its scheduled
# date-time (time_hour).
ggplot(df7, aes(x = time_hour, y = dep_delay)) +
  geom_point(alpha = 0.1)
## Warning: Removed 56 rows containing missing values (`geom_point()`).

# Departure delay over the year with a smoothed trend line.
# ylim() restricts the y scale to [-25, 120]; the warnings emitted when this
# runs show that rows outside that range are removed from both layers.
ggplot(df7, aes(x = time_hour, y = dep_delay)) +
  geom_point(alpha = 0.1) +
  stat_smooth() +
  ylim(-25, 120)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 168 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 168 rows containing missing values (`geom_point()`).

## does departure delay change across time of day?

# Departure delay against time of day, expressed as a fractional hour
# (hour + minute / 60), with a smoothed trend line.
# The y scale is limited to [-25, 120].
ggplot(df7, aes(x = hour + minute / 60, y = dep_delay)) +
  geom_point(alpha = 0.1) +
  stat_smooth() +
  ylim(-25, 120)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 168 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 168 rows containing missing values (`geom_point()`).

# Departure delay against time of day, coloured by day of the week.
# day_of_week is the weekday name derived from time_hour; mapping it to colour
# gives each weekday its own points and its own smoothed line.
df7 |>
  mutate(day_of_week = weekdays(time_hour)) |>
  ggplot(aes(x = hour + minute / 60, y = dep_delay, color = day_of_week)) +
  geom_point(alpha = 0.1) +
  stat_smooth() +
  ylim(-25, 120)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 168 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 168 rows containing missing values (`geom_point()`).

# Same weekday comparison, but with one panel per day of the week and a
# tighter y scale of [-20, 40].
df7 |>
  mutate(day_of_week = weekdays(time_hour)) |>
  ggplot(aes(x = hour + minute / 60, y = dep_delay, color = day_of_week)) +
  geom_point(alpha = 0.1) +
  stat_smooth() +
  ylim(-20, 40) +
  facet_wrap(~day_of_week)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 478 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 478 rows containing missing values (`geom_point()`).